global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/load_data_EUklems.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {



set more off

***********************************************
*************** LOAD EUklems DATA  ************
***********************************************
* Some Notes: 

* We import here the EUklems wage, deflator and employment data.
* The WIOD will be our baseline and EUklems data is used as robustness check.
* EUklems  contains average low-skill wages, high-skill wages: until 2005 not back 
* until 1970 for all; average wages: 1970-2005 for all countries.

* Complications regarding exchange rates

* EUklems:
* For Euro countries: 
* In Euros. For years before joining the Euro the conversion is made using the 
* conversion rate at the year of joining. The list is as follows: 
* Austria: 13.7603, euros from 1999 onwards
* Belgium: 40.3399, euros from 1999 onwards
* Spain: 166.386, euros from 1999 onwards
* Finland: 5.94573, euros from 1999 onwards
* France: 6.55957, euros from 1999 onwards
* Germany: 1.95583, euros from 1999 onwards 
* Greece: 340.750, euros from 2001 onwards
* Ireland: 0.787564, euros from 1999 onwards
* Italy: 1936.27, euros from 1999 onwards 
* -> That makes things relatively straightforward such that inflation numbers 
* deflate correctly and you just switch exchange rate in the cut year to get 
* appropriate numbers in dollars.

* We seek to include sectoral wages. EUklems 2008 uses a system that assigns D to manufacturing and 15-37 as subcategories (NACE Rev 1).
* WIOD instead uses C10-C33 and labels manufacturing C (NACE Rev 2).
* There is a correspondence between the two. It is not exact, but we will use it. 

* Set of countries
* Countries that have low-skill wages
local country_list "irl can kor uk swe svn svk prt pol nld lux jpn ita hun grc ger fra fin  esp dnk dew cze  bel aut aus usa-sic"
local country_list_2017 "AT BE BG CY CZ DE DK EE EL ES FI FR HR HU IE IT LT LU LV NL PL PT RO SE SI SK UK US" 


*Some countries (e.g. CANADA) have no statistics for the TOT or D categories, alas we need to handle them seperately.
*Comment PS:why does this file not really use tempfiles??? all are saved on disk "permamnently" and then erased.... this is cumbersome
**********************************************************************************
* 1. Import EUklems data Basic (standard industry classification + manufacturing)*
**********************************************************************************

* Import by country and by variable. 
* Set empty dataset to merge with.
clear
set obs 1
gen code=""
gen year=0
gen country=""
save ${tmp_dir}/temp_EUklems.dta, replace

* 1.a. Run through countries.
foreach ctr in `country_list' {		

    clear 
    disp "`ctr'"
    set obs 1 
    gen code = ""
    gen year = 0
    sleep 500
    save ${tmp_dir}/temp.dta, replace

    * 1.b. Run through variables.
    foreach var in LAB LABLS LABMS LABHS H_EMP H_HS H_MS H_LS VA VA_QI  { 		
        * EUKLEMS Database | March 2008 Release |
        * https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/MGSB4H | Downloaded August 30th 2018 
        import excel using ${mow_data_raw}/basic_files/`ctr'_output_08I.xls, sheet("`var'") firstrow clear
        *Some excel spread sheets are included with junk-obs
        drop if code==""		

        * Reshape. 
        reshape long _, i(code) j(year)
        rename _ `var'
        merge 1:1 code year using ${tmp_dir}/temp.dta
        drop _merge
        sleep 1500
        save ${tmp_dir}/temp.dta, replace
    }
    
    *End variable run 1.a.
    gen country="`ctr'"

    * Slovakia joined the EU in 2005, thus since EUklems. We convert the currency (exchange rate 30.126).
    replace LAB=LAB/30.1260  if country=="svk"
    replace VA=LAB/30.1260 if country=="svk"

    * The italian low-wage data looks really weird. Lets ignore it and pretend
    * like we don't have LABLS or H_LS in EUklems.
    replace LABLS=. if country=="ita"
    replace H_LS=. if country=="ita"

    *Generate the wage measurements. 
    gen ls_wage=(LAB*LABLS)/(H_EMP*H_LS)
    gen ms_wage=(LAB*LABMS)/(H_EMP*H_MS)
    gen hs_wage=(LAB*LABHS)/(H_EMP*H_HS)
    gen lms_wage=(LAB*(LABLS+LABMS))/(H_EMP*(H_LS + H_MS))

    gen va_emp=VA/H_EMP

    * Label the variables. 
    label var LAB "Labour compensation"
    label var LABLS "Low-skilled labour compensation  (share in total labour compensation)"
    label var LABMS "Med-skilled labour compensation  (share in total labour compensation)"
    label var LABHS "High-skilled labour compensation  (share in total labour compensation)"
    label var H_EMP "Total hours worked by persons engaged (millions)"
    label var H_LS "Hours worked by low-skilled persons engaged (share in total hours)"
    label var H_MS "Hours worked by Med-skilled persons engaged (share in total hours)"
    label var H_HS "Hours worked by high-skilled persons engaged (share in total hours)"
    label var ls_wage "Low-skill wages"
    label var ms_wage "Middle-skilled wages"
    label var hs_wage "High-skill wages"
    label var lms_wage "Low and middle-skill wages"
    label var va_emp "Value added per hour worked" 
    append using ${tmp_dir}/temp_EUklems.dta
    sleep 1300
    save ${tmp_dir}/temp_EUklems.dta, replace
}

* End country run 1.b.  

use ${tmp_dir}/temp_EUklems.dta, clear
drop ls_wage ms_wage hs_wage lms_wage va_emp
tempfile temp_reduced
save `temp_reduced', replace

*go to ctry-year format
use ${tmp_dir}/temp_EUklems.dta, clear
drop if code==""
keep code country year ls_wage ms_wage hs_wage lms_wage va_emp

*reashape to make consistent panel
reshape wide ls_wage ms_wage hs_wage lms_wage va_emp, i(year country) j(code) string
reshape long ls_wage ms_wage hs_wage lms_wage va_emp, i(year country) j(code) string
*replace missing values
merge 1:1 code year country using `temp_reduced'
drop _merge

		
save ${tmp_dir}/temp_EUklems_basic.dta, replace

cap erase ${tmp_dir}/temp.dta
cap erase ${tmp_dir}/temp_EUklems.dta

* End import Euklems Basic. 

**************************************************************************************************
* 2. Import EUKlems data additional (includes measures such as code=="MARKT" for market economy) *
**************************************************************************************************

* Set empty dataset to merge with.
clear
set obs 1
gen code=""
gen year=0
gen country=""
save ${tmp_dir}/temp_EUklems.dta, replace

* 2.a Run through countries.
foreach ctr in `country_list' {									
clear 
set obs 1 
gen code=""
gen year=0
sleep 2000
save ${tmp_dir}/temp.dta, replace

* 2.b. Run through variables. (Thus, import the data by variable and country.)	
foreach var in LAB LABLS LABMS LABHS H_EMP H_HS H_MS H_LS VA VA_QI { 	
        *EUKLEMS Database | March 2008 Release | 
        *https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/MGSB4H | Downloaded August 30th 2018 
        import excel using ${mow_data_raw}/Additional_files/`ctr'_output_alt_08I.xls, sheet("`var'") firstrow clear
        *Some excel spread sheets are included with junk-obs
        drop if code==""		
        * Reshape the data. 
        reshape long _, i(code) j(year)
        rename _ `var'
        merge 1:1 code year using ${tmp_dir}/temp.dta
        drop _merge
        * I let the computer sleep for a second to avoid overwriting on the same file on Surface.
        sleep 1500
        save ${tmp_dir}/temp.dta, replace
    }
    * End variable run 2.a. 
    gen country="`ctr'"

    * Slovakia joined the EU in 2005, thus since EUklems. We convert the currency (exchange rate 30.126).
    replace LAB=LAB/30.1260  if country=="svk"
    replace VA=LAB/30.1260 if country=="svk"

    * Also drop Italy in additional (before we only did in Basic file 
    * which is inconsistent)
    * The italian low-wage data looks really weird. I will basically just pretend 
    * like we don't have LABLS or H_LS in EUklems.
    replace LABLS=. if country=="ita"
    replace H_LS=. if country=="ita"

    *Generate the wage measurements. 
    gen ls_wage=(LAB*LABLS)/(H_EMP*H_LS)
    gen ms_wage=(LAB*LABMS)/(H_EMP*H_MS)
    gen hs_wage=(LAB*LABHS)/(H_EMP*H_HS)
    gen lms_wage=(LAB*(LABLS+LABMS))/(H_EMP*(H_LS + H_MS))
    gen va_emp=VA/H_EMP

    * Label the variables
    label var LAB "Labour compensation"
    label var LABLS "Low-skilled labour compensation  (share in total labour compensation)"
    label var LABMS "Med-skilled labour compensation  (share in total labour compensation)"
    label var LABHS "High-skilled labour compensation  (share in total labour compensation)"
    label var H_EMP "Total hours worked by persons engaged (millions)"
    label var H_LS "Hours worked by low-skilled persons engaged (share in total hours)"
    label var H_MS "Hours worked by Med-skilled persons engaged (share in total hours)"
    label var H_HS "Hours worked by high-skilled persons engaged (share in total hours)"
    label var ls_wage "Low-skill wages"
    label var ms_wage "Middle-skilled wages"
    label var hs_wage "High-skill wages"
    label var lms_wage "Low and middle-skill wages"
    label var va_emp "Value added per hour worked" 
    append using ${tmp_dir}/temp_EUklems.dta
    sleep 1500
    save ${tmp_dir}/temp_EUklems.dta, replace
}
* End country run 2.b. 


save ${tmp_dir}/temp_EUklems_add.dta, replace

cap erase ${tmp_dir}/temp.dta

cap erase ${tmp_dir}/temp_EUklems.dta

* End importing Euklems additional 


***************************************************
* 3. Combine Euklems Basic with Euklems additional *
****************************************************


* 3.a Combine Euklems Basic with Euklems additional (because both include "codes" which the other does not).
use ${tmp_dir}/temp_EUklems_basic.dta, clear 
append using ${tmp_dir}/temp_EUklems_add.dta

* This means that some industries will be counted double: Remove those
* First sort on the wage to make sure we only take the non-missing version.
bysort country year code (ls_wage): gen mgo=_n if (country!="jpn" | year!=1970)

* Update Selna April 19: Problem with JP - we are mixing inputs for 1970 from additional and basic 
* file (for other countries probably too, but there we have same values in both 
* additional and basic - but for JP in 1970 in additional file values are missing)
bysort country year code (LABHS): gen jpn=_n if (country=="jpn" & year==1970)



drop if mgo==2 
drop if jpn==2 
drop mgo jpn
drop Y Z AA AB AC AD AE AF AG AH AI AJ AK AL AM AN

save ${tmp_dir}/temp_EUklems.dta, replace



* 3.b. The Basic file for Canada does not include manufacturing data so we create that
* as a combination of "MexElec (manufacturing excluding electricity)" and "30t33 
* (electricity)" from the additional file
use ${tmp_dir}/temp_EUklems.dta, clear
keep if country=="can" & (code=="MexElec" | code=="30t33")

* Value added weighted observations. 
* First calculate the payment and hours worked over the 3 skill levels for both codes. 
gen H_hours=H_HS*H_EMP
gen M_hours=H_MS*H_EMP
gen L_hours=H_LS*H_EMP
gen LM_hours=(H_LS + H_MS)*H_EMP
gen H_pay=LAB*LABHS
gen M_pay=LAB*LABMS
gen L_pay=LAB*LABLS
gen LM_pay=LAB*(LABLS+LABMS)

* Sum those over the two codes and calculate the wages for the three skill levels using those sums.
collapse (sum) H_hours M_hours L_hours LM_hours H_pay M_pay L_pay LM_pay VA LAB H_EMP (first) country, by(year)
gen ls_wage=L_pay/L_hours
gen ms_wage=M_pay/M_hours
gen hs_wage=H_pay/H_hours
gen lms_wage = LM_pay / LM_hours
gen va_emp=VA/H_EMP
drop H_hours M_hours L_hours LM_hours H_pay M_pay L_pay LM_pay
save ${tmp_dir}/temp.dta, replace

* Also sum up the value added variables.
use ${tmp_dir}/temp_EUklems.dta, clear
keep if country=="can" & (code=="MexElec" | code=="30t33")
sort code year
by code: gen VA_QI_g=VA_QI/VA_QI[_n-1]
gen VA_QI_g_VA=VA_QI_g*VA[_n-1]
gen VA_past=VA[_n-1]
sort year code
collapse (sum) VA_QI_g_VA VA_past, by(year)
gen mgo=VA_QI_g_VA/VA_past
gen VA_QI=100
replace VA_QI=VA_QI[_n-1]*mgo if _n>1
drop mgo
merge 1:1 year using ${tmp_dir}/temp.dta
drop VA_QI_g_VA VA_past _merge
gen code="D"
append using ${tmp_dir}/temp_EUklems.dta

*WE do not renormalize the VA_QI to 100 in 2005 here yet because otherwise we will
* use data from CAN. We will do it in the merge_all_data.do. 

label var ls_wage "Low-skill wages"
label var ms_wage "Middle-skill wages"
label var hs_wage "High-skill wages"
label var lms_wage "Low and middle-skill wages"
label var va_emp "Value added per hour worked"
label var code "Sector code"
label var LAB "Labour compensation"
label var VA "Gross value added"
label var VA_QI "Gross value added index"
label var H_EMP "Total hours worked"
label var country ""

save ${mow_data_proc}/EUklems_import.dta, replace

cap erase ${tmp_dir}/temp.dta


cap erase ${tmp_dir}/temp_EUklems.dta

}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat
